{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "根据链家最近7天，最近30天看房记录，判断市场火热度，以及某房型的火热度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests,time,os,csv,re,logging\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "获取html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_html(url):\n",
    "    try:\n",
    "        r=requests.get(url,timeout=16)\n",
    "        status=r.status_code\n",
    "        r.raise_for_status()\n",
    "        r.encoding=r.apparent_encoding\n",
    "        return r.text\n",
    "    except:\n",
    "        now_time=time.strftime('%Y%m%d%H%M%S')\n",
    "        logging.info(now_time +'      ' +'get_html 异常, ' '    url：'+ url)   "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "尝试获取n次"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_n_times(page_url,n=5):\n",
    "    #获取页面失败或服务器错误时，重试n次\n",
    "    i=1\n",
    "    sleep_time=0.1\n",
    "    index_page=get_html(page_url)\n",
    "    \n",
    "    while index_page==None or \"小差\" in index_page:\n",
    "        if i > n:\n",
    "            break\n",
    "        else:\n",
    "            time.sleep(sleep_time)\n",
    "            index_page=get_html(page_url)\n",
    "            sleep_time+=0.1\n",
    "            i+=1                \n",
    "                               \n",
    "    return index_page"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "获取某 tag 下总页数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_total_page(tag_url):\n",
    "    page_url=tag_url+'d1'\n",
    "    index_page=get_n_times(page_url)\n",
    "    \n",
    "    if index_page==None or \"小差\" in index_page:\n",
    "        total_page=101\n",
    "    else:\n",
    "        soup=BeautifulSoup(index_page,'html.parser')\n",
    "        taged_total_num=soup.find('div',attrs={'class':'resultDes clear'}).span.string   #共找到多少套\n",
    "#         total_page=int(taged_total_num)//30+1     #每页最多显示30套\n",
    "#         if total_page>100:\n",
    "#             total_page=100   #总数超过3000套只显示前3000套\n",
    "        page_data = soup.find('div',attrs={'class':'page-box house-lst-page-box'})['page-data']\n",
    "        total_page=eval(page_data)['totalPage']\n",
    "    return taged_total_num,total_page"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "页面解析并写入csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def parse_index_page(date,index_page,tag,page_num,total_page):\n",
    "    path=os.path.join(os.getcwd(),'results\\\\%s_index_info_all.csv')  %date #L2：两室，P21:200万以下\n",
    "    fieldnames=['date','tag','taged_total_num','total_page','page_num',\n",
    "                'key','title','total_price','total_price_unit','per_price','per_price_unit','prop1']\n",
    "    if  not os.path.exists(path):     #如果文件不存在，创建文件并写入表头\n",
    "        csv_file=open(path,'w',newline='')\n",
    "        writer=csv.DictWriter(csv_file,fieldnames=fieldnames)\n",
    "        writer.writeheader()\n",
    "        csv_file.close()\n",
    "        \n",
    "    csv_file=open(path,'a+',newline='')\n",
    "    writer=csv.DictWriter(csv_file,fieldnames=fieldnames)\n",
    "    \n",
    "    item={}\n",
    "    \n",
    "    soup=BeautifulSoup(index_page,'html.parser')\n",
    "    \n",
    "    #get_basic_info\n",
    "    \n",
    "    taged_total_num=soup.find('div',attrs={'class':'resultDes clear'}).span.string\n",
    "    \n",
    "    \n",
    "    \n",
    "    \n",
    "    #update_basic_info\n",
    "    item.update({'date':date,'tag':tag,'taged_total_num':taged_total_num,'total_page':total_page,'page_num':page_num})\n",
    "    \n",
    "    \n",
    "    index_info=soup.find_all('a',attrs={'class':'text link-hover-green js_triggerGray js_fanglist_title'})\n",
    "    for a in index_info:\n",
    "        key=a['key']\n",
    "        title=a['title']\n",
    "        item.update({'key':key,'title':title})\n",
    "        \n",
    "        price_div=a.parent.next_sibling.next_sibling\n",
    "        total_price=price_div.find('span',attrs={'class':'total-price strong-num'}).string\n",
    "        total_price_unit=price_div.find('span',attrs={'class':'unit'}).string\n",
    "        per_price_info=price_div.find('span',attrs={'class':'info-col price-item minor'}).string\n",
    "        per_price_info=re.search('(\\d+)(\\S{3})',per_price_info).groups()\n",
    "        per_price=per_price_info[0]\n",
    "        per_price_unit=per_price_info[1]\n",
    "        \n",
    "        prop_div=price_div.next_sibling.next_sibling\n",
    "        props=prop_div.find_all('span',attrs={'class':'c-prop-tag2'})\n",
    "        \n",
    "        if len(props)>0:\n",
    "            if \"距离\" in props[0].string:\n",
    "                prop1=props[0].string\n",
    "            else:\n",
    "                prop1=\"\"\n",
    "        else:\n",
    "            prop1=\"\"\n",
    "            \n",
    "#         prop2=props[1].string\n",
    "#         prop3=props[2].string\n",
    "        \n",
    "        item.update({'total_price':total_price,'total_price_unit':total_price_unit,'per_price':per_price,\n",
    "                    'per_price_unit':per_price_unit,'prop1':prop1})\n",
    "        \n",
    "        writer.writerow(item)    #定义writer2csv函数，每次调用函数写入item,结果是重复写入最后一个item，改用writer写入，无问题\n",
    "        \n",
    "    csv_file.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "链接：\n",
    "\n",
    "西渡/两室/三室：https://sh.lianjia.com/ershoufang/xidu/l2l3/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "输入“限制条件”后的 `base_url` 输出所有的房子的数量，及 `id`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "base_url= 'https://sh.lianjia.com/ershoufang/xidu/l2l3/'\n",
    "date=time.strftime('%Y%m%d')\n",
    "# log_dir=os.path.join(os.getcwd(),'results/get_index.log')\n",
    "# logging.basicConfig(filename=log_dir,level=logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_thispage_ids(url):\n",
    "    house_ids=[]    #每运行一次要对house_ids重新指定一个空的列表，否则house_ids会累积上次运行的结果\n",
    "    index_page = get_n_times(url)\n",
    "    soup = BeautifulSoup(index_page, 'html.parser')\n",
    "    lis = soup.find_all('li', attrs={'class':'clear LOGCLICKDATA'})\n",
    "    for li in lis:\n",
    "        if li.contents[0].has_attr('data-housecode'):\n",
    "            house_code = li.contents[0]['data-housecode']\n",
    "            house_ids.append(house_code)\n",
    "        if li.contents[0].has_attr('data-lj_action_housedel_id'):\n",
    "            house_code = li.contents[0]['data-lj_action_housedel_id']\n",
    "            house_ids.append(house_code)\n",
    "    \n",
    "    return house_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_ids(base_url):\n",
    "    ids= []\n",
    "    n_house, n_page = get_total_page(base_url)\n",
    "    urls = [base_url[:-1]  +'pg' + str(i) + '/' for i in range(1,n_page+1)]\n",
    "    for url in urls:\n",
    "        thispage_ids = get_thispage_ids(url)\n",
    "        ids += thispage_ids\n",
    "#         ids.append(thispage_ids)  #此方法，只会更新变量thispage_ids最后的值\n",
    "    return n_house, ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "n_house, ids = get_ids(base_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' 212 '"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n_house"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "url_230='https://sh.lianjia.com/ershoufang/xidu/l2l3bp0ep230/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_house_230,ids_230 = get_ids(url_230)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "house_id = '107000859767'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "house_url = 'https://sh.lianjia.com/ershoufang/107100413346.html'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "house_page = get_n_times(house_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "soup = BeautifulSoup(house_page, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "divs = soup.find_all('div', attrs={'id':'record'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<div class=\"record\" id=\"record\">\n",
       " <div class=\"list\">\n",
       " <div class=\"title\">看房记录\n",
       "                   <span class=\"next disable\"><i></i></span>\n",
       " <span class=\"pre disable\"><i></i></span>\n",
       " </div>\n",
       " <div class=\"content\">\n",
       " <div class=\"record-header\">\n",
       " <div class=\"item mytime\">带看时间</div>\n",
       " <div class=\"item myname\">带看经纪人</div>\n",
       " <div class=\"item mytotal\">本房总带看</div>\n",
       " <div class=\"phone\" style=\"margin-left:12px;\">咨询电话</div>\n",
       " </div>\n",
       " <div class=\"row\"><span class=\"noData\">暂无看房记录</span></div>\n",
       " </div>\n",
       " </div>\n",
       " <div class=\"panel\">\n",
       " <div class=\"panel-title\">近7天带看次数</div>\n",
       " <div class=\"count\">0</div>\n",
       " <div class=\"totalCount\">- 30日带看<span>0</span>次 -</div>\n",
       " <!-- <div class=\"msyy\">马上预约看房</div> -->\n",
       " </div>\n",
       " </div>]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "divs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "aa = soup.find_all(text=re.compile('近7天.*?'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<div class=\"totalCount\">- 30日带看<span>0</span>次 -</div>"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "aa[0].next_sibling.next_sibling.next_sibling.next_sibling"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
